knitr::opts_chunk$set(echo = TRUE)
# libraries
library(tidyverse)
library(dplyr)
library(readxl)
library(ggplot2)
library(wesanderson)
library(RColorBrewer)
library(lattice)
library(car)
library(MASS)
library(broom)
# set directory
setwd("~/Desktop/Cerner Data")
# import data
asp_raw <- read_csv("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/asp_cohort_w_denominators_first_enc_year_uncollapsed.csv")
# weighted data
weights <- read_csv("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/For_Brittany_adults_weighted_cerner_data_12Mar25.csv")
forweights <- read_csv("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/forweights.csv")
glimpse(asp_raw)
## Rows: 258,882
## Columns: 12
## $ ...1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ year <dbl> 2023, 2022, 2016, 2018, 2023, 2017, 2014, 2017, 2022, 20…
## $ prefstate <chr> "OR", "NM", "HI", "ND", "NJ", "TX", "PA", "KY", "ME", "I…
## $ prefurban <chr> "rural", "urban", "urban", "urban", "urban", "urban", "u…
## $ prefrace <chr> "White", "White", "Native Hawaiian or Other Pacific Isla…
## $ prefethnicity <chr> "Non-Hispanic", "Non-Hispanic", "Hispanic or Latino", "N…
## $ prefgender <chr> "Female", "Male", "Male", "Female", "Female", "Female", …
## $ age_group <chr> "65 and Over", "35 to 64", "35 to 64", "35 to 64", "18 t…
## $ n_rwdpts <dbl> 11011, 3183, 31, 37, 5694, 3611, 2316, 149, 70, 310, 311…
## $ n_rwdenc <dbl> 83713, 17773, 237, 146, 34244, 15949, 7404, 212, 369, 10…
## $ n_fungal <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ n_fungalenc <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
summary(asp_raw)
## ...1 year prefstate prefurban
## Min. : 0 Min. :2013 Length:258882 Length:258882
## 1st Qu.: 64720 1st Qu.:2016 Class :character Class :character
## Median :129440 Median :2018 Mode :character Mode :character
## Mean :129440 Mean :2018
## 3rd Qu.:194161 3rd Qu.:2021
## Max. :258881 Max. :2023
## prefrace prefethnicity prefgender age_group
## Length:258882 Length:258882 Length:258882 Length:258882
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## n_rwdpts n_rwdenc n_fungal n_fungalenc
## Min. : 1.0 Min. : 1 Min. : 0.00000 Min. : 0.0000
## 1st Qu.: 2.0 1st Qu.: 5 1st Qu.: 0.00000 1st Qu.: 0.0000
## Median : 10.0 Median : 29 Median : 0.00000 Median : 0.0000
## Mean : 492.2 Mean : 2178 Mean : 0.08021 Mean : 0.2077
## 3rd Qu.: 71.0 3rd Qu.: 217 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :153027.0 Max. :908579 Max. :96.00000 Max. :189.0000
asp <- asp_raw %>%
dplyr::select(year, state = prefstate, urban = prefurban, race = prefrace,
ethnicity = prefethnicity, gender = prefgender, age = age_group,
n_rwdpts, n_fungal)
# how many patient-years total?
asp %>%
summarize(n = sum(n_rwdpts))
## # A tibble: 1 × 1
## n
## <dbl>
## 1 127414388
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts)) +
geom_bar(stat = "identity", width = 0.7, fill = "#91BAB6") +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = wes_palette("Darjeeling1", n = 1)) +
theme_minimal()
p
# race of all RWD patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts, fill = race)) +
geom_bar(stat = "identity", width = 0.7) +
scale_y_continuous(labels = scales::comma) +
scale_fill_brewer(palette = "Set2") +
theme_minimal()
p
White or Caucasian individuals make up the majority of Cerner patients captured in RWD, each year. Black or African American as well as some other race and not mapped/missing make up the remaining majority.
# ethnicity of all RWD patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts, fill = ethnicity)) +
geom_bar(stat = "identity", width = 0.7) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = wes_palette("Darjeeling1", n = 4)) +
theme_minimal()
p
Most patients in RWD are not Hispanic or Latino. The number of Hispanic or Latino patients seems to be increasing over time.
# age group of all RWD patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts, fill = age)) +
geom_bar(stat = "identity", width = 0.7) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = wes_palette("Darjeeling1", n = 3)) +
theme_minimal()
p
Most patients in RWD are 35 to 64. The number of patients in all age categories increases over time.
# sex/gender of all RWD patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts, fill = gender)) +
geom_bar(stat = "identity", width = 0.7) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = wes_palette("Darjeeling1", n = 3)) +
theme_minimal()
p
There appears to be slightly more females in RWD than males each year.
# urban/rural of all RWD patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts, fill = urban)) +
geom_bar(stat = "identity", width = 0.7) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = wes_palette("Darjeeling1", n = 3)) +
theme_minimal()
p
The majority of patients live in urban areas.
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_fungal)) +
geom_bar(stat = "identity", width = 0.7, fill = "#91BAB6") +
scale_y_continuous(labels = scales::comma) +
theme_minimal()
p + labs(title = "Distribution of patients with aspergillosis diagnosis over study period",
y = "Number of Patients", x = "Year")
asp_prev <- asp %>%
group_by(year) %>%
summarize(n_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts)) %>%
mutate(rwd_prev = (n_tot/rwd_tot)*100)
p <- ggplot(data = asp_prev, aes(x = as.factor(year), y = rwd_prev, group = 1)) +
geom_line() +
geom_point() +
scale_y_continuous(labels = scales::comma) +
theme_minimal()
p + labs(title = "Prevalence of patients with aspergillosis diagnosis over study period",
y = "Prevalence (%)", x = "Year")
# how many aspergillosis patients total?
asp %>%
summarize(n = sum(n_fungal))
## # A tibble: 1 × 1
## n
## <dbl>
## 1 20764
forweights %>%
summarize(n = sum(n_fungal))
## # A tibble: 1 × 1
## n
## <dbl>
## 1 20764
# how many aspergillosis patients per year?
n_patients <- asp %>%
group_by(year) %>%
summarize(n_tot = sum(n_fungal))
n_patients
## # A tibble: 11 × 2
## year n_tot
## <dbl> <dbl>
## 1 2013 694
## 2 2014 854
## 3 2015 1057
## 4 2016 1388
## 5 2017 1673
## 6 2018 1998
## 7 2019 2197
## 8 2020 2349
## 9 2021 2858
## 10 2022 2879
## 11 2023 2817
# join weights to data
weighted <- left_join(forweights, weights, by = c("year", "state", "gender",
"eth_race", "age_group")) %>%
na.omit()
# annual prevalence
weighted_annual <- weighted %>%
mutate(weight_case = n_fungal*weights_trim,
weight_n = n_rwdpts*weights_trim) %>%
group_by(year) %>%
summarize(fungal_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts),
weight_fungal_tot = sum(weight_case), weight_rwd_tot = sum(weight_n)) %>%
mutate(prevalence = fungal_tot/rwd_tot*100000,
SE = sqrt((weight_fungal_tot/weight_rwd_tot) *
(1 - (weight_fungal_tot/weight_rwd_tot))/weight_rwd_tot)*100000,
ci_lower = prevalence - 1.96*SE,
ci_upper = prevalence + 1.96*SE)
# state overall prevalence
weighted_state <- weighted %>%
mutate(weight_case = n_fungal*weights_trim,
weight_n = n_rwdpts*weights_trim) %>%
group_by(state) %>%
summarize(fungal_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts),
weight_fungal_tot = sum(weight_case), weight_rwd_tot = sum(weight_n)) %>%
mutate(prevalence = (weight_fungal_tot/weight_rwd_tot)*100000)
# overall prevalence
weighted_all <- weighted %>%
mutate(weight_case = n_fungal*weights_trim,
weight_n = n_rwdpts*weights_trim) %>%
summarize(prev = sum(weight_case), denom = sum(weight_n)) %>%
mutate(prevalence = prev/denom*100000)
# graph
p <- ggplot(data = weighted_annual, aes(x = as.factor(year), y = prevalence, group = 1)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
scale_y_continuous(labels = scales::comma, limits = c(0, NA)) +
theme_minimal()
p2 <- p + labs(title = "",
y = "Weighted Prevalence per 100,000 person-years", x = "Year")
p2
# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/weighted_prev.png", plot = p2, dpi = 300)
# map
# bring in state shapefile
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(tmap)
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(tmaptools)
library(stringr)
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
states <- states(cb = TRUE)
## Retrieving data for the year 2021
## | | | 0% | | | 1% | |= | 1% | |== | 3% | |=== | 4% | |===== | 7% | |====== | 8% | |====== | 9% | |======= | 9% | |============== | 20% | |============================ | 40% | |==================================================== | 74% | |================================================================== | 95% | |======================================================================| 100%
# add state geography
weight_map <- weighted_state %>%
left_join(states, by = c("state" = "STUSPS")) %>%
dplyr::select(state, weight_fungal_tot, weight_rwd_tot, prevalence, geometry) %>%
na.omit()
weight_map <- st_as_sf(x = weight_map)
# split US into three: contiguous, Alaska, and Hawaii
us_cont <- weight_map %>%
filter(!state %in% c("AK", "HI"))
us_al <- weight_map %>%
filter(state == "AK")
us_hi <- weight_map %>%
filter(state == "HI")
# plot contiguous states
weightrt <- tm_shape(us_cont, projection = 2163) +
tm_polygons(col = "prevalence", style = "jenks",
title = "Weighted Prevalence per 100,000 Individuals",
palette = c("#91BAB6", "#BDC881", "#E3B710", "#EC7A05", "#F11B00"),
border.col = "black",
border.alpha = 0.1,
legend.show = FALSE) +
tm_layout(frame = FALSE,
legend.position = c("right", "bottom"),
inner.margins = c(.25, .02, .02, .02)) +
tm_add_legend(type = "fill",
labels = c("0 to <9.5", "9.5 to <13.8", "13.8 to <21.4", "21.4 to <30.1", ">30.1"),
col = c("#91BAB6", "#BDC881", "#E3B710", "#EC7A05", "#F11B00"),
border.lwd = 0.5,
title = "Weighted Prevalence per 100,000 Individuals")
# create inset map of Alaska
weight_ak <- tm_shape(us_al, projection = 3338) +
tm_fill(col = "#91BAB6",
border.col = "black",
border.alpha = 0.1,
legend.show = FALSE) +
tm_layout(frame = FALSE)
# create inset map of Hawaii
weight_hi <- tm_shape(us_hi, projection = 3759) +
tm_fill(col = "#E3B710",
border.col = "black",
border.alpha = 0.1,
legend.show = FALSE) +
tm_layout(frame = FALSE)
# plot insets
library(grid)
weightrt
print(weight_ak, vp = viewport(x = .15, y = .15, width = .3, height = .3))
print(weight_hi, vp = viewport(x = .00001, y = .5, width = .3, height = .3))
# race of aspergillosis patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_fungal, fill = race)) +
geom_bar(stat = "identity", width = 0.7) +
scale_y_continuous(labels = scales::comma) +
scale_fill_brewer(palette = "Set2") +
theme_minimal()
p + labs(title = "Distribution of patients with aspergillosis diagnosis race over study period",
y = "Number of Patients", x = "Year")
The majority of aspergillosis patients each year are White or Caucasian. The number of Asian patients with aspergillosis appears to increase slightly over time, as does the number of Black or African American and those of some other race.
# ethnicity of aspergillosis patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_fungal, fill = ethnicity)) +
geom_bar(stat = "identity", width = 0.7) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = wes_palette("Darjeeling1", n = 4)) +
theme_minimal()
p + labs(title = "Distribution of patients with aspergillosis diagnosis ethnicity over study period",
y = "Number of Patients", x = "Year")
Most aspergillosis patients in RWD are not Hispanic or Latino. The number of Hispanic or Latino patients does increase over time.
# age group of aspergillosis patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_fungal, fill = age)) +
geom_bar(stat = "identity", width = 0.7) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = wes_palette("Darjeeling1", n = 3)) +
theme_minimal()
p
Most aspergillosis patients are 35 to 64 or 65 and over.
# sex/gender of aspergillosis patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_fungal, fill = gender)) +
geom_bar(stat = "identity", width = 0.7) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = wes_palette("Darjeeling1", n = 3)) +
theme_minimal()
p + labs(title = "Distribution of patients with aspergillosis diagnosis sex over study period",
y = "Number of Patients", x = "Year")
There appears to be a relatively even split in the number of aspergillosis patients by sex/gender each year.
# race groups
race_grp1 <- asp %>%
group_by(year, race) %>%
summarize(race_fung_tot = sum(n_fungal), race_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
race_grp2 <- asp %>%
group_by(year) %>%
summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))
race_grp <- left_join(race_grp1, race_grp2, by = "year") %>%
mutate(racefung_of_racetot = (race_fung_tot/race_rwd_tot)*100,
racefung_of_fungtot = (race_fung_tot/fung_tot)*100,
race_of_tot = (race_rwd_tot/rwd_tot)*100) %>%
pivot_longer(cols = c("racefung_of_fungtot", "race_of_tot"),
names_to = "variable",
values_to = "percent")
race_grp_tbl <- race_grp %>%
group_by(race) %>%
summarize(race_fung_tot = sum(race_fung_tot), race_rwd_tot = sum(race_rwd_tot))
ggplot(race_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~race) +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
name = "", labels = c("% of all patients",
"% of aspergillosis patients")) +
theme_minimal()
Very few cases occur in AI/AN, Asian, and NH/API patients - making visual comparisons for these groups difficult. Black or African American patients appear to make up a similar proportion of aspergillosis patients relative to their proportion in the RWD patient totals. White or Caucasian patients appear over represented in aspergillosis cases compared to their proportion in the RWD patient totals for every year. Individuals identifying as some other race appear consistently over represented compared to their RWD patient totals.
# ethnic groups
eth_grp1 <- asp %>%
group_by(year, ethnicity) %>%
summarize(eth_fung_tot = sum(n_fungal), eth_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
eth_grp2 <- asp %>%
group_by(year) %>%
summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))
eth_grp <- left_join(eth_grp1, eth_grp2, by = "year") %>%
mutate(ethfung_of_ethtot = (eth_fung_tot/eth_rwd_tot)*100,
ethfung_of_fungtot = (eth_fung_tot/fung_tot)*100,
eth_of_tot = (eth_rwd_tot/rwd_tot)*100) %>%
pivot_longer(cols = c("ethfung_of_fungtot", "eth_of_tot"), names_to = "variable",
values_to = "percent")
ggplot(eth_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ethnicity) +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
name = "", labels = c("% of all patients",
"% of aspergillosis patients")) +
theme_minimal()
There are a few years where patients that are Hispanic or Latino appear over represented in aspergillosis patients. Patients that are not Hispanic or Latino appear over represented in aspergillosis patients most years until 2019.
# sex/gender groups
sex_grp1 <- asp %>%
group_by(year, gender) %>%
summarize(sex_fung_tot = sum(n_fungal), sex_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
sex_grp2 <- asp %>%
group_by(year) %>%
summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))
sex_grp <- left_join(sex_grp1, sex_grp2, by = "year") %>%
mutate(sexfung_of_sextot = (sex_fung_tot/sex_rwd_tot)*100,
sexfung_of_fungtot = (sex_fung_tot/fung_tot)*100,
sex_of_tot = (sex_rwd_tot/rwd_tot)*100) %>%
pivot_longer(cols = c("sexfung_of_fungtot", "sex_of_tot"), names_to = "variable",
values_to = "percent")
ggplot(sex_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~gender) +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
name = "", labels = c("% of all patients",
"% of aspergillosis patients")) +
theme_minimal()
Male patients are over represented in aspergillosis cases. Each year, a larger percent of all aspergillosis patients were male than their percent in the RWD patient universe.
# age groups
age_grp1 <- asp %>%
group_by(year, age) %>%
summarize(age_fung_tot = sum(n_fungal), age_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
age_grp2 <- asp %>%
group_by(year) %>%
summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))
age_grp <- left_join(age_grp1, age_grp2, by = "year") %>%
mutate(agefung_of_agetot = (age_fung_tot/age_rwd_tot)*100,
agefung_of_fungtot = (age_fung_tot/fung_tot)*100,
age_of_tot = (age_rwd_tot/rwd_tot)*100) %>%
pivot_longer(cols = c("agefung_of_fungtot", "age_of_tot"), names_to = "variable",
values_to = "percent")
ggplot(age_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~age) +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
name = "", labels = c("% of all patients",
"% of aspergillosis patients")) +
theme_minimal()
Patients aged 65 years and older are very over represented in aspergillosis patients every year. Other age groups are under represented
# urban/rural
urban_grp1 <- asp %>%
group_by(year, urban) %>%
summarize(urban_fung_tot = sum(n_fungal), urban_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
urban_grp2 <- asp %>%
group_by(year) %>%
summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))
urban_grp <- left_join(urban_grp1, urban_grp2, by = "year") %>%
mutate(urbanfung_of_urbantot = (urban_fung_tot/urban_rwd_tot)*100,
urbanfung_of_fungtot = (urban_fung_tot/fung_tot)*100,
urban_of_tot = (urban_rwd_tot/rwd_tot)*100) %>%
pivot_longer(cols = c("urbanfung_of_fungtot", "urban_of_tot"), names_to = "variable",
values_to = "percent")
ggplot(urban_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~urban) +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
name = "", labels = c("% of all patients",
"% of aspergillosis patients")) +
theme_minimal()
# state
state_grp1 <- asp %>%
group_by(year, state) %>%
summarize(state_fung_tot = sum(n_fungal), state_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
state_grp2 <- asp %>%
group_by(year) %>%
summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))
state_grp <- left_join(state_grp1, state_grp2, by = "year") %>%
mutate(statefung_of_statetot = (state_fung_tot/state_rwd_tot)*100,
statefung_of_fungtot = (state_fung_tot/fung_tot)*100,
state_of_tot = (state_rwd_tot/rwd_tot)*100) %>%
pivot_longer(cols = c("statefung_of_fungtot", "state_of_tot"), names_to = "variable",
values_to = "percent")
ggplot(state_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~state) +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
name = "", labels = c("% of all patients",
"% of aspergillosis patients")) +
theme_classic()
Several states have too few cases or RWD patients to visually compare in this way. Of the states that we can compare, Arizona, California, South Carolina, Kansas for some years, North Carolina for some years, and New Mexico are over represented each year.
# race groups
race_grp <- left_join(race_grp1, race_grp2, by = "year") %>%
mutate(racefung_of_racetot = (race_fung_tot/race_rwd_tot)*100) %>%
pivot_longer(cols = c("racefung_of_racetot"),
names_to = "variable",
values_to = "percent")
ggplot(race_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~race) +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Zissou1", n = 1),
name = "", labels = c("% of all race total")) +
theme_minimal()
# remove "unknown" as a category
asp_na <- as.data.frame(asp) %>%
mutate_all(~replace(.,. == "Other or Unknown", NA)) %>%
mutate_all(~replace(.,. == "Unknown", NA)) %>%
mutate(state = as.factor(state), year = as.integer(year), gender = as.factor(gender),
race = as.factor(race), ethnicity = as.factor(ethnicity), age = as.factor(age),
urban = as.factor(urban))
# combine asian/pacific islander to reduce race categories
asp_na$race2 <- recode_factor(asp_na$race,
"American Indian or Alaska Native" = "American Indian or Alaska Native",
"Asian" = "Asian/Pacific Islander",
"Black or African American" = "Black or African American",
"Native Hawaiian or Other Pacific Islander" = "Asian/Pacific Islander",
"Some Other Race" = "Other",
"White or Caucasian" = "White")
# combine counts for new race categories
asp2 <- asp_na %>%
group_by(state, year, gender, ethnicity, age, urban, race2) %>%
summarize(n_fungal = sum(n_fungal),
n_rwdpts = sum(n_rwdpts)) %>%
ungroup()
## `summarise()` has grouped output by 'state', 'year', 'gender', 'ethnicity',
## 'age', 'urban'. You can override using the `.groups` argument.
df <- asp2 %>%
mutate(male = as.factor(case_when(gender == 'Male' ~ 1, TRUE ~ 0)),
rural = as.factor(case_when(urban == 'rural' ~ 1, TRUE ~ 0))) %>%
dplyr::select(state, year, male, race2, ethnicity, rural, age,
n_fungal, n_rwdpts) %>%
filter(n_rwdpts > 0) # remove strata where there are no RWD patients meeting those demographics (n_rwdpts is the offset variable and cannot take the log of 0)
df_re <- asp2 %>%
mutate(race2 = as.factor(race2),
male = as.factor(case_when(gender == 'Male' ~ 1, TRUE ~ 0)),
rural = as.factor(case_when(urban == 'rural' ~ 1, TRUE ~ 0)),
race_eth = case_when(ethnicity == 'Hispanic or Latino' ~ 'Hispanic or Latino',
TRUE ~ race2)) %>%
dplyr::select(state, year, male, race_eth, rural, age, n_fungal, n_rwdpts) %>%
filter(n_rwdpts > 0) # remove strata where there are no RWD patients meeting those demographics (n_rwdpts is the offset variable and cannot take the log of 0)
df$race2 <- relevel(df$race2, ref = "White")
df$ethnicity <- relevel(df$ethnicity, ref = "Non-Hispanic")
df$age <- relevel(df$age, ref = "18 to 34")
df_re$race_eth <- as.factor(df_re$race_eth) %>%
relevel(df_re$race_eth, ref = "White")
df_re$age <- relevel(df$age, ref = "18 to 34")
# covid variables
df$covid <- ifelse(df$year >=2020, 1, 0)
df_re$covid <- ifelse(df$year >= 2020, 1, 0)
df$year_center <- df$year - 2019
df_re$year_center <- df_re$year - 2019
df$year_count <- df$year - 2013
# histogram of number aspergillosis patients
p <- ggplot(data = df, aes(x = n_fungal)) +
geom_histogram(binwidth = 1, fill = "#69b3a2", alpha = 0.9) +
theme_minimal()
p
# race groups
race_grp1 <- df %>%
group_by(year, race2) %>%
summarize(race_fung_tot = sum(n_fungal), race_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
race_grp2 <- df %>%
group_by(year) %>%
summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))
race_grp <- left_join(race_grp1, race_grp2, by = "year") %>%
mutate(racefung_of_racetot = (race_fung_tot/race_rwd_tot)*100,
racefung_of_fungtot = (race_fung_tot/fung_tot)*100,
race_of_tot = (race_rwd_tot/rwd_tot)*100) %>%
pivot_longer(cols = c("racefung_of_fungtot", "race_of_tot"),
names_to = "variable",
values_to = "percent")
race_grp_tbl <- race_grp %>%
group_by(race2) %>%
summarize(race_fung_tot = sum(race_fung_tot), race_rwd_tot = sum(race_rwd_tot))
race_plot <- ggplot(race_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~race2, scales = "free_y") +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
name = "", labels = c("% of all patients",
"% of aspergillosis patients")) +
theme_minimal()
race_plot
# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/race_plot.png", plot = race_plot, width = 10, dpi = 300)
# ethnic groups
eth_grp1 <- df %>%
group_by(year, ethnicity) %>%
summarize(eth_fung_tot = sum(n_fungal), eth_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
eth_grp2 <- df %>%
group_by(year) %>%
summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))
eth_grp <- left_join(eth_grp1, eth_grp2, by = "year") %>%
mutate(ethfung_of_ethtot = (eth_fung_tot/eth_rwd_tot)*100,
ethfung_of_fungtot = (eth_fung_tot/fung_tot)*100,
eth_of_tot = (eth_rwd_tot/rwd_tot)*100) %>%
pivot_longer(cols = c("ethfung_of_fungtot", "eth_of_tot"), names_to = "variable",
values_to = "percent")
ethnic_plot <- ggplot(eth_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ethnicity, scales = "free_y",
labeller = as_labeller(c("Hispanic or Latino" = "Hispanic or Latino",
"Non-Hispanic" = "Non-Hispanic or Latino",
"Unknown" = "Unknown"))) +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
name = "", labels = c("% of all patients",
"% of aspergillosis patients")) +
theme_minimal()
ethnic_plot
# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/ethnic_plot.png", plot = ethnic_plot, width = 10, dpi = 300)
# sex/gender groups
sex_grp1 <- df %>%
group_by(year, male) %>%
summarize(sex_fung_tot = sum(n_fungal), sex_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
sex_grp2 <- df %>%
group_by(year) %>%
summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))
sex_grp <- left_join(sex_grp1, sex_grp2, by = "year") %>%
mutate(sexfung_of_sextot = (sex_fung_tot/sex_rwd_tot)*100,
sexfung_of_fungtot = (sex_fung_tot/fung_tot)*100,
sex_of_tot = (sex_rwd_tot/rwd_tot)*100) %>%
pivot_longer(cols = c("sexfung_of_fungtot", "sex_of_tot"), names_to = "variable",
values_to = "percent")
sex_plot <- ggplot(sex_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~male, scales = "free_y",
labeller = as_labeller(c("0" = "Female", "1" = "Male"))) +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
name = "", labels = c("% of all patients",
"% of aspergillosis patients")) +
theme_minimal()
sex_plot
# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/sex_plot.png", plot = sex_plot, width = 8, dpi = 300)
# age groups
age_grp1 <- df %>%
group_by(year, age) %>%
summarize(age_fung_tot = sum(n_fungal), age_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
age_grp2 <- df %>%
group_by(year) %>%
summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))
age_grp <- left_join(age_grp1, age_grp2, by = "year") %>%
mutate(agefung_of_agetot = (age_fung_tot/age_rwd_tot)*100,
agefung_of_fungtot = (age_fung_tot/fung_tot)*100,
age_of_tot = (age_rwd_tot/rwd_tot)*100) %>%
pivot_longer(cols = c("agefung_of_fungtot", "age_of_tot"), names_to = "variable",
values_to = "percent")
age_plot <- ggplot(age_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~age, scales = "free_y") +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
name = "", labels = "") +
theme_minimal()
age_plot
# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/age_plot.png", plot = age_plot, width = 10, dpi = 300)
# urban/rural
urban_grp1 <- df %>%
group_by(year, rural) %>%
summarize(urban_fung_tot = sum(n_fungal), urban_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
urban_grp2 <- df %>%
group_by(year) %>%
summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))
urban_grp <- left_join(urban_grp1, urban_grp2, by = "year") %>%
mutate(urbanfung_of_urbantot = (urban_fung_tot/urban_rwd_tot)*100,
urbanfung_of_fungtot = (urban_fung_tot/fung_tot)*100,
urban_of_tot = (urban_rwd_tot/rwd_tot)*100) %>%
pivot_longer(cols = c("urbanfung_of_fungtot", "urban_of_tot"), names_to = "variable",
values_to = "percent")
urban_plot <- ggplot(urban_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~rural, scales = "free_y",
labeller = as_labeller(c("0" = "Urban", "1" = "Rural"))) +
scale_x_discrete(name = "Year",
labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
"'21", "'22", "23")) +
scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
name = "", labels = c("% of all patients",
"% of aspergillosis patients")) +
theme_minimal()
urban_plot
# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/urban_plot.png", plot = urban_plot, width = 8, dpi = 300)
# year
year <- glm(n_fungal ~ year,
family = poisson,
offset = log(n_rwdpts),
data = df)
summary(year)
##
## Call:
## glm(formula = n_fungal ~ year, family = poisson, data = df, offset = log(n_rwdpts))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -87.101583 4.857266 -17.93 <2e-16 ***
## year 0.038819 0.002405 16.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 30602 on 133559 degrees of freedom
## Residual deviance: 30336 on 133558 degrees of freedom
## AIC: 46322
##
## Number of Fisher Scoring iterations: 7
Have quite a few significant years (when compared to 2013)
# gender/sex
sex <- glm(n_fungal ~ male,
family = poisson,
offset = log(n_rwdpts),
data = df)
summary(sex)
##
## Call:
## glm(formula = n_fungal ~ male, family = poisson, data = df, offset = log(n_rwdpts))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.890208 0.009864 -901.29 <2e-16 ***
## male1 0.366160 0.013880 26.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 30602 on 133559 degrees of freedom
## Residual deviance: 29909 on 133558 degrees of freedom
## AIC: 45894
##
## Number of Fisher Scoring iterations: 7
Females significantly less likely to have aspergillosis diagnosis than males
# ethnicity
eth <- glm(n_fungal ~ ethnicity,
family = poisson,
offset = log(n_rwdpts),
data = df)
summary(eth)
##
## Call:
## glm(formula = n_fungal ~ ethnicity, family = poisson, data = df,
## offset = log(n_rwdpts))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.709450 0.007914 -1100.497 <2e-16 ***
## ethnicityHispanic or Latino -0.026910 0.019351 -1.391 0.164
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 25166 on 90622 degrees of freedom
## Residual deviance: 25164 on 90621 degrees of freedom
## (42937 observations deleted due to missingness)
## AIC: 39090
##
## Number of Fisher Scoring iterations: 7
Hispanic or Latino significant (less likely)
# race
mrace <- glm(n_fungal ~ race2,
family = poisson,
offset = log(n_rwdpts),
data = df)
summary(mrace)
##
## Call:
## glm(formula = n_fungal ~ race2, family = poisson, data = df,
## offset = log(n_rwdpts))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.720609 0.008077 -1079.702 < 2e-16
## race2American Indian or Alaska Native -0.171455 0.086740 -1.977 0.048080
## race2Asian/Pacific Islander 0.297584 0.038597 7.710 1.26e-14
## race2Black or African American -0.090271 0.025034 -3.606 0.000311
## race2Other 0.268391 0.022908 11.716 < 2e-16
##
## (Intercept) ***
## race2American Indian or Alaska Native *
## race2Asian/Pacific Islander ***
## race2Black or African American ***
## race2Other ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 27442 on 106876 degrees of freedom
## Residual deviance: 27235 on 106872 degrees of freedom
## (26683 observations deleted due to missingness)
## AIC: 42217
##
## Number of Fisher Scoring iterations: 8
Asian/Pacific Islander, African-American/Black, and other race groups significant
# age
mage <- glm(n_fungal ~ age,
family = poisson,
offset = log(n_rwdpts),
data = df)
summary(mage)
##
## Call:
## glm(formula = n_fungal ~ age, family = poisson, data = df, offset = log(n_rwdpts))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.66893 0.02219 -435.75 <2e-16 ***
## age35 to 64 0.84553 0.02462 34.34 <2e-16 ***
## age65 and Over 1.48513 0.02435 60.99 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 30602 on 133559 degrees of freedom
## Residual deviance: 25517 on 133557 degrees of freedom
## AIC: 41505
##
## Number of Fisher Scoring iterations: 8
Both age groups significant compared to 18 to 34 year olds
# rural
rural <- glm(n_fungal ~ rural,
family = poisson,
offset = log(n_rwdpts),
data = df)
summary(rural)
##
## Call:
## glm(formula = n_fungal ~ rural, family = poisson, data = df,
## offset = log(n_rwdpts))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.696914 0.007561 -1150.264 < 2e-16 ***
## rural1 -0.149385 0.019049 -7.842 4.43e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 30602 on 133559 degrees of freedom
## Residual deviance: 30538 on 133558 degrees of freedom
## AIC: 46523
##
## Number of Fisher Scoring iterations: 7
Rural significantly less likely to have patients with aspergillosis diagnosis
# remove NAs
df1 <- na.omit(df)
df_re1 <- na.omit(df_re)
# number of fungal cases with complete data
p <- ggplot(data = df1, aes(x = as.factor(year), y = n_fungal)) +
geom_bar(stat = "identity", width = 0.7) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = wes_palette("Darjeeling1", n = 1)) +
theme_minimal()
p
# number of patients with complete data
df1 %>%
summarize(n = sum(n_rwdpts))
## # A tibble: 1 × 1
## n
## <dbl>
## 1 111203140
# prevalence with complete data
asp_prev_complete <- df1 %>%
group_by(year) %>%
summarize(n_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts)) %>%
mutate(rwd_prev = (n_tot/rwd_tot)*100)
p <- ggplot(data = asp_prev_complete, aes(x = as.factor(year), y = rwd_prev, group = 1)) +
geom_line() +
geom_point() +
scale_y_continuous(labels = scales::comma) +
theme_minimal()
p
pois <- glm(n_fungal ~ race2*covid + ethnicity*covid + rural*covid + male + age +
state + year_center*covid,
family = "poisson",
offset = log(n_rwdpts),
data = df1)
logbin <- glm(formula = cbind(n_fungal, n_rwdpts) ~
race2*covid + ethnicity*covid + rural*covid + male + age +
state + year_center*covid,
data = df1,
family = "binomial"(link = "log"))
qpois <- glm(n_fungal ~ race2*covid + ethnicity*covid + rural*covid + male + age +
state + year_center*covid,
family = "quasipoisson",
offset = log(n_rwdpts),
data = df1)
negbin <- glm.nb(n_fungal ~ race2*covid + ethnicity*covid + rural*covid + age + male +
state + year_center*covid +
offset(log(n_rwdpts)),
data = df1)
library(jtools)
summ(pois, exp = T)
## MODEL INFO:
## Observations: 72638
## Dependent Variable: n_fungal
## Type: Generalized linear model
## Family: poisson
## Link function: log
##
## MODEL FIT:
## χ²(68) = 9658.12, p = 0.00
## Pseudo-R² (Cragg-Uhler) = 0.32
## Pseudo-R² (McFadden) = 0.27
## AIC = 26549.99, BIC = 27184.33
##
## Standard errors: MLE
## -------------------------------------------------------------------------
## exp(Est.) 2.5% 97.5% z val. p
## ------------------------------ ----------- ------ ------- -------- ------
## (Intercept) 0.00 0.00 0.00 -19.31 0.00
## race2American Indian or 0.78 0.59 1.02 -1.81 0.07
## Alaska Native
## race2Asian/Pacific 0.93 0.82 1.05 -1.13 0.26
## Islander
## race2Black or African 1.06 0.98 1.14 1.45 0.15
## American
## race2Other 1.25 1.15 1.35 5.41 0.00
## covid 1.02 0.95 1.08 0.51 0.61
## ethnicityHispanic or 0.84 0.79 0.91 -4.68 0.00
## Latino
## rural1 0.86 0.80 0.91 -4.92 0.00
## male1 1.37 1.33 1.41 21.38 0.00
## age35 to 64 2.56 2.42 2.70 34.04 0.00
## age65 and Over 4.95 4.69 5.23 58.10 0.00
## stateAL 0.68 0.25 1.83 -0.77 0.44
## stateAR 0.65 0.21 2.00 -0.74 0.46
## stateAZ 1.63 0.61 4.35 0.98 0.33
## stateCA 1.43 0.54 3.81 0.71 0.48
## stateCO 0.42 0.16 1.14 -1.71 0.09
## stateCT 0.66 0.25 1.78 -0.81 0.42
## stateDC 0.57 0.21 1.58 -1.08 0.28
## stateDE 1.01 0.38 2.70 0.02 0.98
## stateFL 0.51 0.19 1.35 -1.36 0.17
## stateGA 1.19 0.44 3.20 0.34 0.73
## stateHI 1.15 0.42 3.13 0.27 0.78
## stateIA 0.58 0.22 1.55 -1.09 0.28
## stateID 0.47 0.15 1.41 -1.35 0.18
## stateIL 0.45 0.17 1.21 -1.59 0.11
## stateIN 0.80 0.30 2.13 -0.45 0.65
## stateKS 0.82 0.30 2.25 -0.38 0.70
## stateKY 0.83 0.31 2.23 -0.37 0.71
## stateLA 0.52 0.12 2.33 -0.85 0.39
## stateMA 0.67 0.25 1.80 -0.79 0.43
## stateMD 0.51 0.19 1.39 -1.31 0.19
## stateME 0.53 0.20 1.43 -1.25 0.21
## stateMI 0.81 0.28 2.36 -0.38 0.70
## stateMN 0.65 0.23 1.82 -0.83 0.41
## stateMO 0.63 0.23 1.67 -0.94 0.35
## stateMS 0.59 0.21 1.64 -1.01 0.31
## stateMT 0.66 0.24 1.76 -0.84 0.40
## stateNC 1.68 0.62 4.52 1.03 0.30
## stateND 1.41 0.41 4.81 0.54 0.59
## stateNE 0.59 0.22 1.59 -1.04 0.30
## stateNH 0.68 0.25 1.86 -0.74 0.46
## stateNJ 0.58 0.22 1.55 -1.08 0.28
## stateNM 0.60 0.22 1.63 -1.00 0.32
## stateNV 0.25 0.08 0.79 -2.35 0.02
## stateNY 0.78 0.29 2.07 -0.51 0.61
## stateOH 0.41 0.15 1.11 -1.75 0.08
## stateOK 0.57 0.15 2.11 -0.85 0.40
## stateOR 0.64 0.24 1.73 -0.88 0.38
## statePA 0.63 0.24 1.68 -0.93 0.35
## stateRI 4.97 1.60 15.40 2.78 0.01
## stateSC 1.00 0.37 2.68 0.00 1.00
## stateSD 0.96 0.33 2.77 -0.08 0.94
## stateTN 0.59 0.22 1.57 -1.06 0.29
## stateTX 0.50 0.19 1.34 -1.37 0.17
## stateUT 0.08 0.02 0.30 -3.76 0.00
## stateVA 0.75 0.28 2.01 -0.58 0.56
## stateVT 0.43 0.15 1.18 -1.64 0.10
## stateWA 0.29 0.10 0.81 -2.34 0.02
## stateWI 0.64 0.24 1.71 -0.90 0.37
## stateWV 0.63 0.24 1.69 -0.91 0.36
## stateWY 0.49 0.18 1.35 -1.38 0.17
## year_center 1.05 1.04 1.07 9.39 0.00
## race2American Indian or 1.15 0.80 1.65 0.77 0.44
## Alaska Native:covid
## race2Asian/Pacific 1.28 1.09 1.50 3.08 0.00
## Islander:covid
## race2Black or African 1.04 0.94 1.16 0.83 0.41
## American:covid
## race2Other:covid 1.00 0.90 1.11 -0.01 0.99
## covid:ethnicityHispanic or 1.18 1.08 1.30 3.48 0.00
## Latino
## covid:rural1 1.06 0.98 1.15 1.40 0.16
## covid:year_center 0.95 0.93 0.97 -4.89 0.00
## -------------------------------------------------------------------------
summ(logbin, exp = T)
## MODEL INFO:
## Observations: 72638
## Dependent Variable: cbind(n_fungal, n_rwdpts)
## Type: Generalized linear model
## Family: binomial
## Link function: log
##
## MODEL FIT:
## χ²(68) = 9656.22, p = 0.00
## Pseudo-R² (Cragg-Uhler) = 0.32
## Pseudo-R² (McFadden) = 0.27
## AIC = 26536.39, BIC = 27170.72
##
## Standard errors: MLE
## -------------------------------------------------------------------------
## exp(Est.) 2.5% 97.5% z val. p
## ------------------------------ ----------- ------ ------- -------- ------
## (Intercept) 0.00 0.00 0.00 -19.32 0.00
## race2American Indian or 0.78 0.59 1.02 -1.81 0.07
## Alaska Native
## race2Asian/Pacific 0.93 0.82 1.05 -1.13 0.26
## Islander
## race2Black or African 1.06 0.98 1.14 1.45 0.15
## American
## race2Other 1.25 1.15 1.35 5.41 0.00
## covid 1.02 0.95 1.08 0.51 0.61
## ethnicityHispanic or 0.84 0.79 0.91 -4.68 0.00
## Latino
## rural1 0.86 0.80 0.91 -4.92 0.00
## male1 1.37 1.33 1.41 21.38 0.00
## age35 to 64 2.56 2.42 2.70 34.04 0.00
## age65 and Over 4.95 4.69 5.23 58.09 0.00
## stateAL 0.68 0.25 1.83 -0.77 0.44
## stateAR 0.65 0.21 2.00 -0.74 0.46
## stateAZ 1.63 0.61 4.35 0.98 0.33
## stateCA 1.43 0.54 3.81 0.71 0.48
## stateCO 0.42 0.16 1.14 -1.71 0.09
## stateCT 0.66 0.25 1.78 -0.81 0.42
## stateDC 0.57 0.21 1.58 -1.08 0.28
## stateDE 1.01 0.38 2.70 0.02 0.98
## stateFL 0.51 0.19 1.35 -1.36 0.17
## stateGA 1.19 0.44 3.20 0.34 0.73
## stateHI 1.15 0.42 3.13 0.27 0.78
## stateIA 0.58 0.22 1.55 -1.09 0.28
## stateID 0.47 0.15 1.41 -1.35 0.18
## stateIL 0.45 0.17 1.21 -1.59 0.11
## stateIN 0.80 0.30 2.13 -0.45 0.65
## stateKS 0.82 0.30 2.25 -0.38 0.70
## stateKY 0.83 0.31 2.22 -0.37 0.71
## stateLA 0.52 0.12 2.33 -0.85 0.39
## stateMA 0.67 0.25 1.80 -0.79 0.43
## stateMD 0.51 0.19 1.39 -1.31 0.19
## stateME 0.53 0.20 1.43 -1.25 0.21
## stateMI 0.81 0.28 2.36 -0.38 0.70
## stateMN 0.65 0.23 1.82 -0.83 0.41
## stateMO 0.63 0.23 1.67 -0.94 0.35
## stateMS 0.59 0.21 1.64 -1.01 0.31
## stateMT 0.66 0.24 1.76 -0.84 0.40
## stateNC 1.68 0.62 4.52 1.03 0.30
## stateND 1.41 0.41 4.81 0.54 0.59
## stateNE 0.59 0.22 1.59 -1.04 0.30
## stateNH 0.68 0.25 1.86 -0.74 0.46
## stateNJ 0.58 0.22 1.55 -1.08 0.28
## stateNM 0.60 0.22 1.63 -1.00 0.32
## stateNV 0.25 0.08 0.79 -2.35 0.02
## stateNY 0.78 0.29 2.07 -0.51 0.61
## stateOH 0.41 0.15 1.11 -1.75 0.08
## stateOK 0.57 0.15 2.11 -0.85 0.40
## stateOR 0.64 0.24 1.73 -0.88 0.38
## statePA 0.63 0.24 1.68 -0.93 0.35
## stateRI 4.96 1.60 15.38 2.77 0.01
## stateSC 1.00 0.37 2.68 0.00 1.00
## stateSD 0.96 0.33 2.77 -0.08 0.94
## stateTN 0.59 0.22 1.57 -1.06 0.29
## stateTX 0.50 0.19 1.34 -1.37 0.17
## stateUT 0.08 0.02 0.30 -3.76 0.00
## stateVA 0.75 0.28 2.01 -0.58 0.56
## stateVT 0.43 0.15 1.18 -1.64 0.10
## stateWA 0.29 0.10 0.81 -2.35 0.02
## stateWI 0.64 0.24 1.71 -0.90 0.37
## stateWV 0.63 0.24 1.69 -0.91 0.36
## stateWY 0.49 0.18 1.35 -1.38 0.17
## year_center 1.05 1.04 1.07 9.39 0.00
## race2American Indian or 1.15 0.80 1.65 0.77 0.44
## Alaska Native:covid
## race2Asian/Pacific 1.28 1.09 1.50 3.08 0.00
## Islander:covid
## race2Black or African 1.04 0.94 1.16 0.83 0.41
## American:covid
## race2Other:covid 1.00 0.90 1.11 -0.01 0.99
## covid:ethnicityHispanic or 1.18 1.08 1.30 3.48 0.00
## Latino
## covid:rural1 1.06 0.98 1.15 1.40 0.16
## covid:year_center 0.95 0.93 0.97 -4.89 0.00
## -------------------------------------------------------------------------
tmp <- data.frame(pois = AIC(pois), logbin = AIC(logbin), negb = AIC(negbin))
knitr::kable(tmp, align = 'l')
| pois | logbin | negb |
|---|---|---|
| 26549.99 | 26536.39 | 26363.42 |
Negative binomial lower than Poisson, but not by much - will check dispersion
library(AER)
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
dispersiontest(pois)
##
## Overdispersion test
##
## data: pois
## z = 6.3161, p-value = 1.341e-10
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 1.03155
Check theta for negative binomial
summary(negbin)$theta
## [1] 13.94523
Likeliehood ratio test
library(lmtest)
lrtest(pois, negbin)
## Likelihood ratio test
##
## Model 1: n_fungal ~ race2 * covid + ethnicity * covid + rural * covid +
## male + age + state + year_center * covid
## Model 2: n_fungal ~ race2 * covid + ethnicity * covid + rural * covid +
## age + male + state + year_center * covid + offset(log(n_rwdpts))
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 69 -13206
## 2 70 -13112 1 188.58 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Poisson isn’t too overdispersed, but negative binomial fits better. Negative binomial has a high theta - will use Quasi-Poisson as a middle ground.
# Extract model coefficients
regression_results <- tidy(qpois)
# Compute robust standard errors
library(sandwich)
robust_se <- vcovHC(qpois, type = "HC0")
# Extract test statistics and coefficients with robust SEs
regression_results <- coeftest(qpois, vcov. = robust_se)
# combine into dataframe
robust_results <- data.frame(
Term = rownames(regression_results),
Estimate = regression_results[, 1],
Robust_SE = regression_results[, 2],
z_value = regression_results[, 3],
p_values = regression_results[, 4]
)
# add 95% confidence intervals
z_value <- 1.96
robust_results$conf.low <- robust_results$Estimate - z_value * robust_results$Robust_SE
robust_results$conf.high <- robust_results$Estimate + z_value * robust_results$Robust_SE
print(robust_results)
## Term
## (Intercept) (Intercept)
## race2American Indian or Alaska Native race2American Indian or Alaska Native
## race2Asian/Pacific Islander race2Asian/Pacific Islander
## race2Black or African American race2Black or African American
## race2Other race2Other
## covid covid
## ethnicityHispanic or Latino ethnicityHispanic or Latino
## rural1 rural1
## male1 male1
## age35 to 64 age35 to 64
## age65 and Over age65 and Over
## stateAL stateAL
## stateAR stateAR
## stateAZ stateAZ
## stateCA stateCA
## stateCO stateCO
## stateCT stateCT
## stateDC stateDC
## stateDE stateDE
## stateFL stateFL
## stateGA stateGA
## stateHI stateHI
## stateIA stateIA
## stateID stateID
## stateIL stateIL
## stateIN stateIN
## stateKS stateKS
## stateKY stateKY
## stateLA stateLA
## stateMA stateMA
## stateMD stateMD
## stateME stateME
## stateMI stateMI
## stateMN stateMN
## stateMO stateMO
## stateMS stateMS
## stateMT stateMT
## stateNC stateNC
## stateND stateND
## stateNE stateNE
## stateNH stateNH
## stateNJ stateNJ
## stateNM stateNM
## stateNV stateNV
## stateNY stateNY
## stateOH stateOH
## stateOK stateOK
## stateOR stateOR
## statePA statePA
## stateRI stateRI
## stateSC stateSC
## stateSD stateSD
## stateTN stateTN
## stateTX stateTX
## stateUT stateUT
## stateVA stateVA
## stateVT stateVT
## stateWA stateWA
## stateWI stateWI
## stateWV stateWV
## stateWY stateWY
## year_center year_center
## race2American Indian or Alaska Native:covid race2American Indian or Alaska Native:covid
## race2Asian/Pacific Islander:covid race2Asian/Pacific Islander:covid
## race2Black or African American:covid race2Black or African American:covid
## race2Other:covid race2Other:covid
## covid:ethnicityHispanic or Latino covid:ethnicityHispanic or Latino
## covid:rural1 covid:rural1
## covid:year_center covid:year_center
## Estimate Robust_SE
## (Intercept) -9.6743739582 0.470612554
## race2American Indian or Alaska Native -0.2532650010 0.151273266
## race2Asian/Pacific Islander -0.0703069117 0.065412069
## race2Black or African American 0.0557086992 0.045369077
## race2Other 0.2194497636 0.047317444
## covid 0.0167282702 0.039765956
## ethnicityHispanic or Latino -0.1696088898 0.041652133
## rural1 -0.1561064829 0.035007879
## male1 0.3148675579 0.018243694
## age35 to 64 0.9393316989 0.032824259
## age65 and Over 1.5998037177 0.032250863
## stateAL -0.3929021527 0.479904883
## stateAR -0.4254889516 0.540706994
## stateAZ 0.4894299148 0.469887065
## stateCA 0.3570819605 0.469537814
## stateCO -0.8684757307 0.476527163
## stateCT -0.4084976797 0.472351297
## stateDC -0.5579168015 0.487412786
## stateDE 0.0097208047 0.472581687
## stateFL -0.6828487635 0.473466526
## stateGA 0.1730360096 0.473665453
## stateHI 0.1402366510 0.480667044
## stateIA -0.5488381444 0.473827132
## stateID -0.7644695807 0.539450031
## stateIL -0.8023579897 0.473894432
## stateIN -0.2241826710 0.470124400
## stateKS -0.1972103963 0.484760117
## stateKY -0.1874848292 0.473811433
## stateLA -0.6499887778 0.745213634
## stateMA -0.3977089041 0.473102698
## stateMD -0.6734540895 0.483482809
## stateME -0.6310924906 0.472892500
## stateMI -0.2070061209 0.513752910
## stateMN -0.4356260918 0.491831289
## stateMO -0.4693362447 0.471096403
## stateMS -0.5244278805 0.490056983
## stateMT -0.4214675740 0.473556054
## stateNC 0.5186884192 0.481841700
## stateND 0.3414511638 0.627327281
## stateNE -0.5236689415 0.474960238
## stateNH -0.3786500582 0.482400013
## stateNJ -0.5457240709 0.474127909
## stateNM -0.5104445482 0.480519807
## stateNV -1.3739317385 0.550869149
## stateNY -0.2548073744 0.470525780
## stateOH -0.8847188783 0.475777942
## stateOK -0.5702655433 0.639763489
## stateOR -0.4465410131 0.475379668
## statePA -0.4657807985 0.470602631
## stateRI 1.6029002877 0.569018770
## stateSC 0.0008069862 0.473786397
## stateSD -0.0417943517 0.522056578
## stateTN -0.5326122858 0.474391187
## stateTX -0.6895774005 0.472222801
## stateUT -2.5228856034 0.640088493
## stateVA -0.2914993159 0.477969399
## stateVT -0.8509490293 0.483173816
## stateWA -1.2507620547 0.504706242
## stateWI -0.4527100029 0.475831621
## stateWV -0.4590467532 0.472110386
## stateWY -0.7151720435 0.491380402
## year_center 0.0530192076 0.006823566
## race2American Indian or Alaska Native:covid 0.1413055255 0.192943375
## race2Asian/Pacific Islander:covid 0.2491547529 0.081789780
## race2Black or African American:covid 0.0436001682 0.067393929
## race2Other:covid -0.0007085809 0.067140078
## covid:ethnicityHispanic or Latino 0.1661129490 0.059889128
## covid:rural1 0.0580635678 0.046409306
## covid:year_center -0.0529290078 0.013641602
## z_value p_values
## (Intercept) -20.55698235 6.665151e-94
## race2American Indian or Alaska Native -1.67422181 9.408701e-02
## race2Asian/Pacific Islander -1.07483087 2.824505e-01
## race2Black or African American 1.22790021 2.194844e-01
## race2Other 4.63781944 3.521042e-06
## covid 0.42066813 6.739974e-01
## ethnicityHispanic or Latino -4.07203366 4.660444e-05
## rural1 -4.45918139 8.227327e-06
## male1 17.25898066 9.578693e-67
## age35 to 64 28.61699649 4.129115e-180
## age65 and Over 49.60498880 0.000000e+00
## stateAL -0.81870839 4.129528e-01
## stateAR -0.78691224 4.313332e-01
## stateAZ 1.04159053 2.976016e-01
## stateCA 0.76049670 4.469577e-01
## stateCO -1.82251044 6.837757e-02
## stateCT -0.86481752 3.871390e-01
## stateDC -1.14464950 2.523544e-01
## stateDE 0.02056958 9.835890e-01
## stateFL -1.44223240 1.492368e-01
## stateGA 0.36531271 7.148780e-01
## stateHI 0.29175425 7.704745e-01
## stateIA -1.15830882 2.467380e-01
## stateID -1.41712770 1.564456e-01
## stateIL -1.69311546 9.043349e-02
## stateIN -0.47685819 6.334631e-01
## stateKS -0.40682059 6.841398e-01
## stateKY -0.39569503 6.923300e-01
## stateLA -0.87221804 3.830894e-01
## stateMA -0.84063969 4.005498e-01
## stateMD -1.39292251 1.636432e-01
## stateME -1.33453690 1.820280e-01
## stateMI -0.40292934 6.870002e-01
## stateMN -0.88572261 3.757670e-01
## stateMO -0.99626370 3.191220e-01
## stateMS -1.07013653 2.845579e-01
## stateMT -0.89000567 3.734628e-01
## stateNC 1.07647059 2.817168e-01
## stateND 0.54429510 5.862384e-01
## stateNE -1.10255322 2.702212e-01
## stateNH -0.78492962 4.324948e-01
## stateNJ -1.15100601 2.497298e-01
## stateNM -1.06227577 2.881105e-01
## stateNV -2.49411633 1.262712e-02
## stateNY -0.54153754 5.881371e-01
## stateOH -1.85952059 6.295338e-02
## stateOK -0.89136932 3.727311e-01
## stateOR -0.93933553 3.475585e-01
## statePA -0.98975392 3.222944e-01
## stateRI 2.81695503 4.848131e-03
## stateSC 0.00170327 9.986410e-01
## stateSD -0.08005713 9.361918e-01
## stateTN -1.12272804 2.615530e-01
## stateTX -1.46027977 1.442132e-01
## stateUT -3.94146377 8.098587e-05
## stateVA -0.60987025 5.419478e-01
## stateVT -1.76116544 7.821040e-02
## stateWA -2.47819811 1.320478e-02
## stateWI -0.95140798 3.413973e-01
## stateWV -0.97232928 3.308868e-01
## stateWY -1.45543461 1.455490e-01
## year_center 7.77001444 7.847718e-15
## race2American Indian or Alaska Native:covid 0.73236785 4.639441e-01
## race2Asian/Pacific Islander:covid 3.04628221 2.316903e-03
## race2Black or African American:covid 0.64694504 5.176675e-01
## race2Other:covid -0.01055377 9.915795e-01
## covid:ethnicityHispanic or Latino 2.77367455 5.542710e-03
## covid:rural1 1.25111908 2.108910e-01
## covid:year_center -3.87997020 1.044693e-04
## conf.low conf.high
## (Intercept) -10.59677456 -8.75197335
## race2American Indian or Alaska Native -0.54976060 0.04323060
## race2Asian/Pacific Islander -0.19851457 0.05790074
## race2Black or African American -0.03321469 0.14463209
## race2Other 0.12670757 0.31219195
## covid -0.06121300 0.09466954
## ethnicityHispanic or Latino -0.25124707 -0.08797071
## rural1 -0.22472193 -0.08749104
## male1 0.27910992 0.35062520
## age35 to 64 0.87499615 1.00366725
## age65 and Over 1.53659203 1.66301541
## stateAL -1.33351572 0.54771142
## stateAR -1.48527466 0.63429676
## stateAZ -0.43154873 1.41040856
## stateCA -0.56321216 1.27737608
## stateCO -1.80246897 0.06551751
## stateCT -1.33430622 0.51731086
## stateDC -1.51324586 0.39741226
## stateDE -0.91653930 0.93598091
## stateFL -1.61084315 0.24514563
## stateGA -0.75534828 1.10142030
## stateHI -0.80187075 1.08234406
## stateIA -1.47753932 0.37986303
## stateID -1.82179164 0.29285248
## stateIL -1.73119108 0.12647510
## stateIN -1.14562650 0.69726115
## stateKS -1.14734022 0.75291943
## stateKY -1.11615524 0.74118558
## stateLA -2.11060750 0.81062994
## stateMA -1.32499019 0.52957238
## stateMD -1.62108039 0.27417222
## stateME -1.55796179 0.29577681
## stateMI -1.21396182 0.79994958
## stateMN -1.39961542 0.52836323
## stateMO -1.39268520 0.45401271
## stateMS -1.48493957 0.43608381
## stateMT -1.34963744 0.50670229
## stateNC -0.42572131 1.46309815
## stateND -0.88811031 1.57101263
## stateNE -1.45459101 0.40725312
## stateNH -1.32415408 0.56685397
## stateNJ -1.47501477 0.38356663
## stateNM -1.45226337 0.43137427
## stateNV -2.45363527 -0.29422821
## stateNY -1.17703790 0.66742315
## stateOH -1.81724364 0.04780589
## stateOK -1.82420198 0.68367089
## stateOR -1.37828516 0.48520314
## statePA -1.38816196 0.45660036
## stateRI 0.48762350 2.71817708
## stateSC -0.92781435 0.92942832
## stateSD -1.06502525 0.98143654
## stateTN -1.46241901 0.39719444
## stateTX -1.61513409 0.23597929
## stateUT -3.77745905 -1.26831216
## stateVA -1.22831934 0.64532071
## stateVT -1.79796971 0.09607165
## stateWA -2.23998629 -0.26153782
## stateWI -1.38533998 0.47991997
## stateWV -1.38438311 0.46628960
## stateWY -1.67827763 0.24793354
## year_center 0.03964502 0.06639340
## race2American Indian or Alaska Native:covid -0.23686349 0.51947454
## race2Asian/Pacific Islander:covid 0.08884678 0.40946272
## race2Black or African American:covid -0.08849193 0.17569227
## race2Other:covid -0.13230313 0.13088597
## covid:ethnicityHispanic or Latino 0.04873026 0.28349564
## covid:rural1 -0.03289867 0.14902581
## covid:year_center -0.07966655 -0.02619147
library(multcomp)
## Loading required package: mvtnorm
##
## Attaching package: 'mvtnorm'
## The following object is masked from 'package:jtools':
##
## standardize
## Loading required package: TH.data
##
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
##
## geyser
# set up named linear combination for the post-COVID slope
K_named <- setNames(rep(0, length(coef(qpois))), names(coef(qpois)))
K_named["year_center"] <- 1
K_named["covid:year_center"] <- 1
post_slope_glht <- glht(qpois, linfct = rbind(K_named))
# summary and confidence interval
summary(post_slope_glht)
##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: glm(formula = n_fungal ~ race2 * covid + ethnicity * covid +
## rural * covid + male + age + state + year_center * covid,
## family = "quasipoisson", data = df1, offset = log(n_rwdpts))
##
## Linear Hypotheses:
## Estimate Std. Error z value Pr(>|z|)
## K_named == 0 0.0000902 0.0076414 0.012 0.991
## (Adjusted p values reported -- single-step method)
confint(post_slope_glht)
##
## Simultaneous Confidence Intervals
##
## Fit: glm(formula = n_fungal ~ race2 * covid + ethnicity * covid +
## rural * covid + male + age + state + year_center * covid,
## family = "quasipoisson", data = df1, offset = log(n_rwdpts))
##
## Quantile = 1.96
## 95% family-wise confidence level
##
##
## Linear Hypotheses:
## Estimate lwr upr
## K_named == 0 0.0000902 -0.0148867 0.0150671
# robust SE
vcov_robust <- vcovHC(qpois, type = "HC0")
glht(qpois, linfct = rbind(K_named), vcov = vcov_robust)
##
## General Linear Hypotheses
##
## Linear Hypotheses:
## Estimate
## K_named == 0 9.02e-05
# RR instead of log, remove state and year variables
robust_results <- robust_results %>%
mutate(RR = exp(Estimate),
conf.low = exp(conf.low),
conf.high = exp(conf.high)) %>%
filter(str_detect(Term, "Intercept|race2|ethnicity|rural|male|age35|age65|covid|year"))
p_forest_regression <- ggplot(robust_results, aes(x = reorder(Term, RR),
y = RR, ymin = conf.low,
ymax = conf.high)) +
geom_pointrange(size = 1.2, color = "black") + # No p-value coloring
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
coord_flip() +
theme_minimal() +
labs(y = "Rate Ratio (RR)", x = "Predictors",
title = "Estimated Rate Ratios with Robust Standard Errors",
subtitle = "Confidence Intervals for Each Predictor")
p_forest_regression
Did the COVID pandemic deferentially affect aspergillosis diagnosis rates among certain demographic subgroups?
Estimate marginal means for interaction between race, ethnicity, rurality, and covid
library(emmeans)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
# get ratios for different groups
emm_options(rg.limit = 50000)
emm <- emmeans(qpois, ~ race2*ethnicity*rural*covid,
at = list(year_center = 0),
type = "response")
# compare Post- vs. Pre-COVID within each group
contrast_results <- contrast(emm, "revpairwise", by = c("race2", "ethnicity", "rural"),
adjust = "sidak")
contrast_summary <- summary(contrast_results, infer = c(TRUE, TRUE), type = "response")
contrast_df <- as.data.frame(contrast_summary) %>%
mutate(rural = ifelse(rural == "0", "urban", "rural"), # replace binary indicator with label
group = paste(race2, ethnicity, rural, sep = " - "), # combine groups with separators
p.value = signif(p.value, 3)) # format p-values to 3 digits
If RR > 1, aspergillosis rate increased post-COVID
If RR < 1, aspergillosis rate decreased post-COVID
~ 1, the rate stayed the same
Visualize results
p_forest <- ggplot(contrast_df, aes(x = group, y = ratio, ymin = asymp.LCL,
ymax = asymp.UCL, color = p.value < 0.05)) +
geom_pointrange(size = 1.2) + # confidence intervals
geom_hline(yintercept = 0, linetype = "dashed", color = "red") + # reference line at no change (0)
coord_flip() +
theme_minimal() +
labs(y = "Change in Rate Ratio (Post-COVID vs. Pre-COVID)", x = "Group",
title = "Post- vs. Pre-COVID Aspergillosis Rate Ratio Changes",
subtitle = "Statistically significant comparisons (p < 0.05) are highlighted") +
scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black"))
p_forest
Average for rural patients
contrast_df_na <- na.omit(contrast_df)
contrast_df_rural <- contrast_df_na %>%
group_by(rural) %>%
summarize(
avg_ratio = mean(ratio),
conf_low = mean(asymp.LCL),
conf.high = mean(asymp.UCL)
)
Boxplot - rural
p_box <- ggplot(contrast_df, aes(x = race2, y = ratio, fill = rural)) +
geom_boxplot(width = 0.4, position = position_dodge(width = 0.5)) +
theme_minimal() +
labs(y = "aPR Change", x = "Racial Group",
title = "Comparison of PR Changes Across Race & Rural Status",
legend.title = "Urbanicity",
fill = "Residence Type") +
scale_fill_manual(values = c("rural" = "#D67236",
"urban" = "#EAD3BF"))
p_box
# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/rural_box.png", plot = p_box, width = 12, dpi = 300)
Average over race group to get ethnicity total
contrast_df_eth <- contrast_df %>%
group_by(ethnicity) %>%
summarize(
avg_ratio = mean(ratio),
conf_low = mean(asymp.LCL),
conf.high = mean(asymp.UCL)
)
Boxplot - race and ethnicity
p_box3 <- ggplot(contrast_df, aes(x = race2, y = ratio, fill = ethnicity)) +
geom_boxplot(width = 0.4, position = position_dodge(width = 0.2)) +
theme_minimal() +
labs(y = "aPR Change", x = "Racial Group",
title = "Comparison of PR Changes Across Race & Ethnicity Status",
fill = "Ethnicity") +
scale_fill_manual(values = c("Non-Hispanic" = "#E3B710",
"Hispanic or Latino" = "#54A5B9"),
labels = c("Non-Hispanic" = "Non-Hispanic or Latino",
"Hispanic or Latino" = "Hispanic or Latino"))
p_box3
# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/ethnicity_box.png", plot = p_box3, width = 12, dpi = 300)
Sort by RR change
contrast_df <- contrast_df %>%
arrange(desc(ratio))
Test for trends across race groups, controlling for ethnicity
lm_model <- lm(ratio ~ race2 + ethnicity, data = contrast_df)
summary(lm_model)
##
## Call:
## lm(formula = ratio ~ race2 + ethnicity, data = contrast_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.056712 -0.037101 0.000015 0.033633 0.063758
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0382075 0.0249027 41.691 4.37e-16
## race2American Indian or Alaska Native 0.1733118 0.0321492 5.391 9.52e-05
## race2Asian/Pacific Islander 0.3230865 0.0321492 10.050 8.79e-08
## race2Black or African American 0.0508878 0.0321492 1.583 0.136
## race2Other -0.0008088 0.0321492 -0.025 0.980
## ethnicityHispanic or Latino 0.2073612 0.0203330 10.198 7.32e-08
##
## (Intercept) ***
## race2American Indian or Alaska Native ***
## race2Asian/Pacific Islander ***
## race2Black or African American
## race2Other
## ethnicityHispanic or Latino ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04547 on 14 degrees of freedom
## Multiple R-squared: 0.9477, Adjusted R-squared: 0.929
## F-statistic: 50.71 on 5 and 14 DF, p-value: 1.783e-08
American Indian or Alaska Native and Asian/Pacific Islander had higher rate ratios (17% and 32%, respectively) than White individuals. Hispanic or Latino ethnicity had higher rate ratios (21%).
Check Fit
# check residual deviance
deviance(qpois) / df.residual(qpois)
## [1] 0.1846822
Likelihood ratio test
lrtest(qpois, negbin)
## Likelihood ratio test
##
## Model 1: n_fungal ~ race2 * covid + ethnicity * covid + rural * covid +
## male + age + state + year_center * covid
## Model 2: n_fungal ~ race2 * covid + ethnicity * covid + rural * covid +
## age + male + state + year_center * covid + offset(log(n_rwdpts))
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 70
## 2 70 -13112 0
Check VIF
VIF does not depend on response variable distribution, so can
be computed from a Poisson model and applied to QPoisson
# create model matrix (excluding intercept)
X <- model.matrix(~ race2 * covid + ethnicity * covid + rural * covid + male + state, data = df1)[, -1]
# compute Variance Inflation Factor manually
vif_manual <- diag(solve(cor(X))) # Inverse of correlation matrix
print(vif_manual)
## race2American Indian or Alaska Native
## 2.190987
## race2Asian/Pacific Islander
## 2.235495
## race2Black or African American
## 2.268267
## race2Other
## 2.291185
## covid
## 4.940847
## ethnicityHispanic or Latino
## 1.690401
## rural1
## 1.683494
## male1
## 1.002470
## stateAL
## 2.312752
## stateAR
## 1.934501
## stateAZ
## 2.860342
## stateCA
## 3.059482
## stateCO
## 2.465622
## stateCT
## 2.564149
## stateDC
## 1.886334
## stateDE
## 2.320816
## stateFL
## 2.607875
## stateGA
## 2.316290
## stateHI
## 2.096909
## stateIA
## 2.528445
## stateID
## 1.939010
## stateIL
## 2.552520
## stateIN
## 2.644641
## stateKS
## 2.217563
## stateKY
## 2.380409
## stateLA
## 1.699608
## stateMA
## 2.550047
## stateMD
## 2.366866
## stateME
## 2.477718
## stateMI
## 1.971938
## stateMN
## 2.096119
## stateMO
## 2.730403
## stateMS
## 2.148887
## stateMT
## 2.350814
## stateNC
## 2.199568
## stateND
## 1.725493
## stateNE
## 2.446277
## stateNH
## 2.061078
## stateNJ
## 2.598385
## stateNM
## 2.605087
## stateNV
## 2.206400
## stateNY
## 2.641685
## stateOH
## 2.526006
## stateOK
## 1.921465
## stateOR
## 2.450313
## statePA
## 2.584870
## stateRI
## 1.513530
## stateSC
## 2.446383
## stateSD
## 2.001837
## stateTN
## 2.204163
## stateTX
## 2.755818
## stateUT
## 2.174149
## stateVA
## 2.410477
## stateVT
## 1.974936
## stateWA
## 2.232801
## stateWI
## 2.355739
## stateWV
## 2.226638
## stateWY
## 2.185249
## race2American Indian or Alaska Native:covid
## 2.444151
## race2Asian/Pacific Islander:covid
## 2.537198
## race2Black or African American:covid
## 2.613060
## race2Other:covid
## 2.688884
## covid:ethnicityHispanic or Latino
## 2.381220
## covid:rural1
## 2.273979
Check residuals
# Extract residuals and fitted values
df_resid <- data.frame(Fitted = fitted(qpois), Residuals = residuals(qpois))
# Plot residuals
ggplot(df_resid, aes(x = Fitted, y = Residuals)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess", color = "red") +
theme_minimal() +
labs(title = "Residuals vs Fitted Values")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Failed to fit group -1.
## Caused by error in `predLoess()`:
## ! workspace required (7915126834) is too large probably because of setting 'se = TRUE'.